

## #######################################################################################################
## 
## Replication R command file for:
## Kosuke Imai, Bethany Park, and Kenneth Greene "Using the Predicted Responses from List Experiments as
## Explanatory Variables in Regression Models."
##
## Created 18 August 2014
## 
## Contact Bethany Park <bapark@princeton.edu> with questions
## 
## #######################################################################################################

## ###
## NB: the code here uses functions of the R list package available through CRAN at
## http://cran.r-project.org/web/packages/list/index.html
## Version 7.0 of the software is included in this replication archive,
## which was used to confirm that this code replicates the tables and figures in the paper.
## ###


## Table 1 - "Summary of the Responses to List Experiment and Direct Questioning about Vote-selling"
load("mexicoall.rda")

## Direct
table(mexico.all$mex.direct)
prop.table(table(mexico.all$mex.direct))

se.direct <- sqrt(mean(mexico.all$mex.direct) * (1 - mean(mexico.all$mex.direct)) / nrow(mexico.all))
direct.lower.p <- mean(mexico.all$mex.direct) - se.direct*1.96
direct.upper.p <- mean(mexico.all$mex.direct) + se.direct*1.96
cbind(direct.lower.p, mean(mexico.all$mex.direct), direct.upper.p) # mean and confidence interval

## List
table(mexico.all$mex.y.all[mexico.all$mex.t == 1])
table(mexico.all$mex.y.all[mexico.all$mex.t == 0])

table(is.na(mexico.all$mex.y.all[mexico.all$mex.t == 1]))
table(is.na(mexico.all$mex.y.all[mexico.all$mex.t == 0]))

se.list <- sqrt(var(mexico.all$mex.y.all[mexico.all$mex.t == 1], na.rm = T)/561 + var(mexico.all$mex.y.all[mexico.all$mex.t == 0], na.rm = T)/559)
list.est <- mean(mexico.all$mex.y.all[mexico.all$mex.t == 1], na.rm = T) - mean(mexico.all$mex.y.all[mexico.all$mex.t == 0], na.rm = T)
list.lower.p <- list.est - se.direct*1.96
list.upper.p <- list.est + se.direct*1.96
cbind(list.lower.p, list.est, list.upper.p)

## In-text results, Section 2.2
library("list")
library("quadprog")
data(mexico)
test <- ict.test(y = mexico$mex.y.all, treat = mexico$mex.t, J = 3, alpha = 0.05, gms = FALSE)
test

## Figure 1 -- see "simulations_replication.R" in this archive



## Figure 2 -- "The Predicted Probability of Vote-selling and the Estimated Effect of Vote-selling on Turnout from the List Experiment." These are the models reported in the Appendix A2, Tables 2-4.
turnoutreg <- ictreg.joint(formula = mex.y.all ~ mex.male + mex.age + mex.age2 +
                           mex.education +  
                           mex.interest + mex.married +
                           mex.urban +
                           mex.cleanelections + mex.cleanelectionsmiss +
                           mex.havepropoganda +
                           mex.concurrent + mex.wealth + 
                           mex.loyal,
                           data = mexico,
                           treat = "mex.t", outcome = "mex.votecard",
                           J = 3, constrained = TRUE,
                           outcome.reg = "logistic", maxIter = 1000)
summary(turnoutreg) ## Appendix, Table 2
#save(turnoutreg, file = "turnoutreg.RData")

loyal <- mexico[mexico$mex.loyal == 1,]
notloyal <- mexico[mexico$mex.loyal == 0,]

notloyalreg <- ictreg.joint(formula = mex.y.all ~ mex.male + mex.age + mex.age2 +
                            mex.education +  
                            mex.interest + mex.married +
                            mex.wealth + mex.urban + mex.havepropoganda + mex.concurrent,
                            data = notloyal,
                            treat = "mex.t", outcome = "mex.votecard",
                            J = 3, constrained = TRUE,
                            outcome.reg = "logistic", maxIter = 1000)
summary(notloyalreg) ## Appendix, Table 3
#save(notloyalreg, file = "notloyalreg.RData")

loyalreg <- ictreg.joint(formula = mex.y.all ~ mex.male + mex.age + mex.age2 + mex.education +  
                         mex.interest + mex.married +
                         mex.wealth + mex.urban + mex.havepropoganda + mex.concurrent, data = loyal,
                         treat = "mex.t", outcome = "mex.votecard", J = 3, constrained = TRUE,
                         outcome.reg = "logistic", maxIter = 1000)
summary(loyalreg) ## Appendix, Table 4
#save(loyalreg, file = "loyalreg.RData")

## Predictions for turnout
loyalpred <- predict.ictreg.joint(loyalreg, se.fit = TRUE, interval = "confidence", level = 0.95,
                                  avg = TRUE, sensitive.value = "both", sensitive.diff = TRUE, return.draws = TRUE,
                                  predict.sensitive = TRUE)
loyalpred$fit
#save(loyalpred, file = "loyalpred.RData")
                                       
notloyalpred <- predict.ictreg.joint(notloyalreg, se.fit = TRUE, interval = "confidence", level = 0.95,
                                     avg = TRUE, sensitive.value = "both", sensitive.diff = TRUE, return.draws = TRUE,
                                     predict.sensitive = TRUE)
notloyalpred$fit
#save(notloyalpred, file = "notloyalpred.RData")

turnoutpred <- predict.ictreg.joint(turnoutreg, se.fit = TRUE, interval = "confidence", level = 0.95,
                           avg = TRUE, sensitive.value = "both", sensitive.diff = TRUE, return.draws = TRUE)
turnoutpred$fit
#save(turnoutpred, file = "turnoutpred.RData")

####### To use our exact models and predictions and replicate the figures exactly, load the following items.
####### If not, proceed with your own run of the models (using the code given above).
load("turnoutreg.RData")
load("notloyalreg.RData")
load("loyalreg.RData")
load("notloyalpred.RData")
load("loyalpred.RData")
load("turnoutpred.RData")
#######
#######

## Difference in differences
diffindiff <- notloyalpred$sens.diff - loyalpred$sens.diff
mean.diffindiff <- mean(diffindiff)
lower.diffindiff <- quantile(diffindiff, probs = 0.025)
upper.diffindiff <- quantile(diffindiff, probs = 0.975)
cbind(lower.diffindiff, mean.diffindiff, upper.diffindiff)

## Overall difference
predict.diff.notloyal <- notloyalpred$draws.predict[[2]] - notloyalpred$draws.predict[[1]] # Predicted difference for each not loyal person
predict.diff.loyal <- loyalpred$draws.predict[[2]] - loyalpred$draws.predict[[1]] # Predicted difference for each loyal person
alldiff <- rbind(predict.diff.notloyal, predict.diff.loyal)
alldiff.mean <- apply(alldiff, 2, mean) # take mean over all observations
avgdiff <- mean(alldiff.mean)
avgdiff.lower <- quantile(alldiff.mean, probs = 0.025)
avgdiff.upper <- quantile(alldiff.mean, probs = 0.975)

## Predictions for vote-selling
loyalpred$fitsens
notloyalpred$fitsens

## Difference in vote-selling...
predict.diff.selling <- notloyalpred$draws.mean.sens - loyalpred$draws.mean.sens ## nonloyal- loyal
simple.mean <- mean(predict.diff.selling, na.rm = TRUE)
simple.ci.lower <- quantile(predict.diff.selling, probs = .025, na.rm = TRUE)
simple.ci.upper <- quantile(predict.diff.selling, probs = .975, na.rm = TRUE)

simple.data.selling <- cbind(simple.ci.lower, simple.mean, simple.ci.upper)
simple.data.selling

all.new <- rbind(loyalpred$draws.predict.sens, notloyalpred$draws.predict.sens) #a matrix where rows are all observations, columns are simulations
draws.mean.all <- apply(all.new, 2, mean) ## Take mean over all observations
avgmean <- mean(draws.mean.all)
avglower <- quantile(draws.mean.all, probs = 0.025)
avgupper <- quantile(draws.mean.all, probs = 0.975)

#plot -- loyalists not more likely to have sold their votes
lower <- c(avglower, loyalpred$fitsens[2], notloyalpred$fitsens[2], simple.data.selling[1])
means <- c(avgmean, loyalpred$fitsens[1], notloyalpred$fitsens[1], simple.data.selling[2])
upper <- c(avgupper, loyalpred$fitsens[3], notloyalpred$fitsens[3], simple.data.selling[3])

par(mfrow = c(1,2))
x <- c(1,2,3,4)
plot(x, means, pch = 20, type = "p", ylim = c(-0.6, 0.6), xlim = c(0.5, 4.5),
     main = "Party Support and Vote-Selling",
     xaxt = "n", xlab = "", bty = "n",
     ylab = "Predicted Probability of Vote-Selling")
for(i in 1:4){
    lines(c(x[i], x[i]), c(lower[i], upper[i]))
}
abline(h = 0, lty = 3)
text(c(1,2,3,4), c(0.1, 0.05, 0.15, -0.30), labels = c("Overall", "Supporters", "Non-supporters", "Difference"))
text(4, -0.40, labels = "(Non-supporters -\nSupporters)")
#plot -- among loyalists, those who sold vote less likely to turnout; among non-loyalists, those who sold vote less likely to turnout; differences of effect of vote-selling not different in two cases
lower.diff <- c(avgdiff.lower, loyalpred$fit[3,2], notloyalpred$fit[3,2], lower.diffindiff)
means.diff <- c(avgdiff, loyalpred$fit[3,1], notloyalpred$fit[3,1], mean.diffindiff)
upper.diff <- c(avgdiff.upper, loyalpred$fit[3,3], notloyalpred$fit[3,3], upper.diffindiff)
plot(x, means.diff, pch = 20, type = "p", ylim = c(-0.6, 0.6), xlim = c(0.5, 4.5),
     main = "Vote-Selling and Turnout",
     xaxt = "n", xlab = "", bty = "n",
     ylab = "Estimated Effect of Vote-Selling on Turnout")
for(i in 1:4){
    lines(c(x[i], x[i]), c(lower.diff[i], upper.diff[i]))
}
abline(h = 0, lty = 3)
text(c(1,2,3,4), c(-0.04, -0.07, 0.05, -0.30), labels = c("Overall", "Supporters", "Non-supporters", "Difference"))
text(4, -0.40, labels = "(Non-supporters -\nSupporters)")




## Figure 3 -- Predicted Turnout and the Estimated Effect of Vote-selling on Turnout from the Direct Question and the List Experiment. These models are reported in Appendix A2, Tables 2 and 5.

## List experiment

avglist <- mean(c(turnoutpred$draws.mean[[1]], turnoutpred$draws.mean[[2]]))
avglist.lower <- quantile(c(turnoutpred$draws.mean[[1]], turnoutpred$draws.mean[[2]]), probs = .025)
avglist.upper <- quantile(c(turnoutpred$draws.mean[[1]], turnoutpred$draws.mean[[2]]), probs = .975)

#plot 
lower.list <- c(avglist.lower, turnoutpred$fit[2, 2], turnoutpred$fit[1,2], turnoutpred$fit[3,2])
means.list <- c(avglist, turnoutpred$fit[2, 1], turnoutpred$fit[1,1], turnoutpred$fit[3,1])
upper.list <- c(avglist.upper, turnoutpred$fit[2, 3], turnoutpred$fit[1,3], turnoutpred$fit[3,3])
x <- c(1,2,3,4)

## Direct question (model in Table 5, Appendix A2)
turnout <- glm(mex.votecard ~ mex.male + mex.age + mex.age2 +
                           mex.education +  
                           mex.interest + mex.married +
                           mex.urban +
                           mex.cleanelections + mex.cleanelectionsmiss +
                           mex.havepropoganda +
                           mex.concurrent + mex.wealth + 
                           mex.loyal + mex.direct - 1, data = mexico, family = binomial(link = "logit"))
summary(turnout)


var.matrix <- vcov(turnout)
coef.matrix <- coef(turnout)
library("MASS")
logistic <- function(x) exp(x)/(1+exp(x))

## Create datasets -- vary direct, loyal ID
cov.names <- list("male", "age", "age2", "education", "interest", "married", "urban", "cleanelections", "cleanelectionmiss", "havepropoganda", "concurrent", "wealth", "loyal", "direct")
covariates <- list(mexico$mex.male, mexico$mex.age, mexico$mex.age2, mexico$mex.education, mexico$mex.interest, mexico$mex.married, mexico$mex.urban, mexico$mex.cleanelections, mexico$mex.cleanelectionsmiss, mexico$mex.havepropoganda, mexico$mex.concurrent, mexico$mex.wealth, mexico$mex.loyal, mexico$mex.direct)

for (i in 1:length(cov.names)) {
  covariates[[i]] <- as.matrix(covariates[[i]])
  colnames(covariates[[i]]) <- cov.names[[i]]
}
cov.data <- data.frame(covariates)

data1 <- cbind(1) #sold vote
data2 <- cbind(0) #did not sell vote

columns <- c("zrep")

dataset <- list(data1, data2)
for (i in 1:2) {
    colnames(dataset[[i]]) <- columns
    dataset[[i]] <- cbind(dataset[[i]], cov.data)
    dataset[[i]] <- dataset[[i]][c(2:14, 1)]
}

draws.predict <- list()
draws.mean <- list()
mean.direct <- c()
ci.lower.direct <- c()
ci.upper.direct <- c()
for (i in 1:2) {
    draws <- mvrnorm(n = 10000, coef.matrix, var.matrix)
    draws.predict[[i]] <- logistic(as.matrix(dataset[[i]])%*% t(draws))
    draws.mean[[i]] <- apply(draws.predict[[i]], 2, mean, na.rm = TRUE)
    mean.direct[i] <- mean(draws.mean[[i]], na.rm = TRUE)
    ci.lower.direct[i] <- quantile(draws.mean[[i]], probs = .025, na.rm = TRUE)
    ci.upper.direct[i] <- quantile(draws.mean[[i]], probs = .975, na.rm = TRUE)
}
point.estimates.direct <- cbind(ci.lower.direct, mean.direct, ci.upper.direct)
point.estimates.direct

## Difference

predict.diff <- draws.mean[[1]] - draws.mean[[2]] ## sold vote - did not sell vote
simple.mean.direct <- mean(predict.diff, na.rm = TRUE)
simple.ci.lower.direct <- quantile(predict.diff, probs = .025, na.rm = TRUE)
simple.ci.upper.direct <- quantile(predict.diff, probs = .975, na.rm = TRUE)

simple.data.direct <- cbind(simple.ci.lower.direct, simple.mean.direct, simple.ci.upper.direct)
simple.data.direct

avgdirect <- mean(c(draws.mean[[1]], draws.mean[[2]]))
avgdirect.lower <- quantile(c(draws.mean[[1]], draws.mean[[2]]), probs = .025)
avgdirect.upper <- quantile(c(draws.mean[[1]], draws.mean[[2]]), probs = .975)

#plot
lower.direct <- c(avgdirect.lower, ci.lower.direct, simple.data.direct[1])
means.direct <- c(avgdirect, mean.direct, simple.data.direct[2])
upper.direct <- c(avgdirect.upper, ci.upper.direct, simple.data.direct[3])

par(mfrow = c(1,2))
plot(x, means.direct, pch = 20, type = "p", ylim = c(-0.5, 1), xlim = c(0.5, 4.5),
     main = "Direct Question",
     xaxt = "n", xlab = "",
     ylab = "Predicted Probability of Turnout", bty = "n")
for(i in 1:4){
    lines(c(x[i], x[i]), c(lower.direct[i], upper.direct[i]))
}
abline(h = 0, lty = 3)
text(c(1,2,3,3.9), c(1.0, 1.0, 1.0, 0.8), labels = c("Overall", "Vote Sellers", "Non-Sellers", "Difference"))
text(3.9, .65, labels = "(Vote sellers -\nNon-sellers)")

plot(x, means.list, pch = 20, type = "p", ylim = c(-0.5, 1), xlim = c(0.5, 4.5),
     main = "List Experiment",
     xaxt = "n", xlab = "",
     ylab = "Predicted Probability of Turnout", bty = "n")
for(i in 1:4){
    lines(c(x[i], x[i]), c(lower.list[i], upper.list[i]))
}
abline(h = 0, lty = 3)
text(c(1,2,3,3.9), c(1, 1, 1, 0.8), labels = c("Overall", "Vote Sellers", "Non-Sellers", "Difference"))
text(3.9, 0.65, labels = "(Vote sellers -\nNon-sellers)")



## Figure 4 -- "The Predicted Approval Rating of Enrique Pena Nieto (PRI) and the Estimated Effect of Vote-selling on the Rating from the Direct Question and the List Experiment." These are the models reported in the Appendix A3, Tables 6-7.

# List experiment
approvalreg <- ictreg.joint(formula = mex.y.all ~ mex.male + mex.age + mex.age2 +
                            mex.education +
                            mex.interest + mex.married +
                            mex.urban + 
                            mex.cleanelections + mex.cleanelectionsmiss +
                            mex.havepropoganda +
                            mex.wealth + mex.northregion +
                            mex.centralregion + mex.metro + mex.pidpriw2 + mex.pidpanw2 + mex.pidprdw2,
                            data = mexico, treat = "mex.t", outcome = "mex.epnapprove",
                            J = 3, constrained = TRUE,
                            outcome.reg = "linear", maxIter = 1000)
summary(approvalreg)
#save(approvalreg, file = "approvalreg.RData")


## Predicted approval, for those who sold their vote vs. didn't
## Using one step estimator & list experiment
approvalpred <- predict.ictreg.joint(approvalreg, se.fit = TRUE, interval = "confidence", level = 0.95,
                           avg = TRUE, sensitive.value = "both", sensitive.diff = TRUE, return.draws = TRUE)
approvalpred$fit
#save(approvalpred, file = "approvalpred.RData")

####### To use our exact models and predictions and replicate the figures exactly, load the following items.
####### If not, proceed with your own run of the models (using the code given above).
load("approvalreg.RData")
load("approvalpred.RData")
#######
#######

# Direct question
approve <- lm(mex.epnapprove ~ mex.male + mex.age + mex.age2 +
              mex.education +
              mex.interest + mex.married + mex.urban +
              mex.cleanelections + mex.cleanelectionsmiss + mex.havepropoganda +
              mex.wealth + mex.northregion +
              mex.centralregion + mex.metro + mex.pidpriw2 + mex.pidpanw2 + mex.pidprdw2 + mex.direct - 1, data = mexico)
summary(approve)

avgdifflist <- mean(c(approvalpred$draws.mean[[1]], approvalpred$draws.mean[[2]]))
avgdifflist.lower <- quantile(c(approvalpred$draws.mean[[1]], approvalpred$draws.mean[[2]]), probs = .025)
avgdifflist.upper <- quantile(c(approvalpred$draws.mean[[1]], approvalpred$draws.mean[[2]]), probs = .975)

#plot -- people who sold vote have slightly higher opinion of Pena Nieto
lower.list <- c(avgdifflist.lower, approvalpred$fit[2, 2], approvalpred$fit[1,2], approvalpred$fit[3,2])
means.list <- c(avgdifflist, approvalpred$fit[2, 1], approvalpred$fit[1,1], approvalpred$fit[3,1])
upper.list <- c(avgdifflist.upper, approvalpred$fit[2, 3], approvalpred$fit[1,3], approvalpred$fit[3,3])
x <- c(1,2,3,4)

## Predicted approval, for those who sold their vote vs. didn't
## Using direct question
var.matrix <- vcov(approve)
coef.matrix <- coef(approve)

## Create datasets -- vary vote selling
cov.names <- list("male", "age", "age2", "education", "interest", "married", "urban", "cleanelections", "cleanelectionmiss", "havepropoganda", "wealth",  "northregion", "centralregion", "metro", "pidpriw2", "pidpanw2", "pidprdw2")
covariates <- list(mexico$mex.male, mexico$mex.age, mexico$mex.age2, mexico$mex.education, mexico$mex.interest, mexico$mex.married, mexico$mex.urban, mexico$mex.cleanelections, mexico$mex.cleanelectionsmiss, mexico$mex.havepropoganda, mexico$mex.wealth, mexico$mex.northregion, mexico$mex.centralregion, mexico$mex.metro, mexico$mex.pidpriw2, mexico$mex.pidpanw2, mexico$mex.pidprdw2)

for (i in 1:length(cov.names)) {
  covariates[[i]] <- as.matrix(covariates[[i]])
  colnames(covariates[[i]]) <- cov.names[[i]]
}
cov.data <- data.frame(covariates)

data1 <- cbind(1) #sold vote
data2 <- cbind(0) #did not sell vote

columns <- c("zrep")

dataset <- list(data1, data2)
for (i in 1:2) {
    colnames(dataset[[i]]) <- columns
    dataset[[i]] <- cbind(dataset[[i]], cov.data)
    dataset[[i]] <- dataset[[i]][c(2:18, 1)]
}

draws.predict <- list()
draws.mean <- list()
mean.direct <- c()
ci.lower.direct <- c()
ci.upper.direct <- c()

for (i in 1:2) {
    draws <- mvrnorm(n = 10000, coef.matrix, var.matrix)
    draws.predict[[i]] <- as.matrix(dataset[[i]])%*% t(draws)
    draws.mean[[i]] <- apply(draws.predict[[i]], 2, mean, na.rm = TRUE)
    mean.direct[i] <- mean(draws.mean[[i]], na.rm = TRUE)
    ci.lower.direct[i] <- quantile(draws.mean[[i]], probs = .025, na.rm = TRUE)
    ci.upper.direct[i] <- quantile(draws.mean[[i]], probs = .975, na.rm = TRUE)
}
point.estimates.direct <- cbind(ci.lower.direct, mean.direct, ci.upper.direct)
point.estimates.direct

## Difference

predict.diff <- draws.mean[[1]] - draws.mean[[2]] ## sold vote - did not sell vote
simple.mean.direct <- mean(predict.diff, na.rm = TRUE)
simple.ci.lower.direct <- quantile(predict.diff, probs = .025, na.rm = TRUE)
simple.ci.upper.direct <- quantile(predict.diff, probs = .975, na.rm = TRUE)

simple.data.direct <- cbind(simple.ci.lower.direct, simple.mean.direct, simple.ci.upper.direct)
simple.data.direct

avgdiffdirect <- mean(c(draws.mean[[1]], draws.mean[[2]]))
avgdiffdirect.lower <- quantile(c(draws.mean[[1]], draws.mean[[2]]), probs = .025)
avgdiffdirect.upper <- quantile(c(draws.mean[[1]], draws.mean[[2]]), probs = .975)

#plot -- people who did NOT sell vote have slightly higher opinion of Pena Nieto
lower.direct <- c(avgdiffdirect.lower, ci.lower.direct, simple.data.direct[1])
means.direct <- c(avgdiffdirect, mean.direct, simple.data.direct[2])
upper.direct <- c(avgdiffdirect.upper, ci.upper.direct, simple.data.direct[3])

par(mfrow = c(1,2))
plot(x, means.direct, pch = 20, type = "p", ylim = c(-1.8, 9.6), xlim = c(0.5, 4.5),
     main = "Direct Question",
     xaxt = "n", xlab = "",
     ylab = "Predicted Approval, Rating", bty = "n")
for(i in 1:4){
    lines(c(x[i], x[i]), c(lower.direct[i], upper.direct[i]))
}
abline(h = 0, lty = 3)
text(c(1,2,3,3.9), c(7.7, 7.3, 8.2, 2.0), labels = c("Overall", "Vote Sellers", "Non-Sellers", "Difference"))
text(3.9, 1.0, labels = "(Vote sellers -\nNon-sellers)")

plot(x, means.list, pch = 20, type = "p", ylim = c(-1.8, 9.6), xlim = c(0.5, 4.5),
     main = "List Experiment",
     xaxt = "n", xlab = "",
     ylab = "Predicted Approval, Rating", bty = "n")
for(i in 1:4){
    lines(c(x[i], x[i]), c(lower.list[i], upper.list[i]))
}
abline(h = 0, lty = 3)
text(c(1,2,3,3.9), c(8.8, 9, 7.8, 3.4), labels = c("Overall", "Vote Sellers", "Non-Sellers", "Difference"))
text(3.9, 2.4, labels = "(Vote sellers -\nNon-sellers)")



## Table 8: Estimated Coefficients of the Logistic Regression Model, Predicting Answers to the Direct Vote-buying Question
directq <- glm(mex.direct ~ mex.male + mex.age + mex.age2 +
               mex.education + mex.interest + mex.married +
               mex.urban + mex.cleanelections + mex.cleanelectionsmiss +
               mex.havepropoganda + mex.pidpriw2 + mex.pidpanw2 +
               mex.pidprdw2 + mex.concurrent + mex.wealth - 1, data = mexico,
               family = binomial(link = "logit"))
summary(directq)
